# --- Notebook presentation helper ------------------------------------------
# Injects a jQuery snippet that hides every input (code) cell when the page
# loads and renders a button the reader can click to toggle the raw code.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit"
value="Click here to toggle on/off the raw code."></form>''')
from IPython.core.display import HTML
HTML("""
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
""")
# --- Standard library -------------------------------------------------------
import datetime
import os.path
import re
import sqlite3
import time  # added: used by the Spotify pagination rate-limit sleep
import warnings
import warnings  # duplicate retained from the original cell order
from warnings import simplefilter

# --- Third-party ------------------------------------------------------------
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import requests  # added: used by every API call below (existing runtime dep)
import seaborn as sns
from IPython.core.display import HTML
from IPython.display import Image
from nltk.corpus import stopwords
from sklearn.exceptions import ConvergenceWarning
from sqlalchemy import create_engine
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.types import FLOAT, INTEGER, TEXT, VARCHAR  # added: to_sql dtypes
from wordcloud import WordCloud

# --- One-time setup ---------------------------------------------------------
nltk.download('stopwords', quiet=True)
warnings.simplefilter(action='ignore', category=FutureWarning)
simplefilter("ignore", category=ConvergenceWarning)
# Default figure size for every matplotlib plot in this notebook.
plt.rcParams["figure.figsize"] = (12, 8)
# Display the TED banner image (file expected next to the notebook).
Image(filename="TED banner.png")
| Table Name | Description | Rows | Columns |
|---|---|---|---|
| YouTube | Contains cleaned and organized scraped data from YouTube | 460 | 12 |
| Spotify | Contains cleaned and organized scraped data from Spotify | 309 | 7 |
| Feature | Data Type | Description |
|---|---|---|
| full_title_author | str | Published title of video |
| title | str | Official title of video |
| author | str | Name of speaker |
| published_at | object | Date published on YouTube |
| description | str | Video description |
| video_id | str | Unique identifier per video |
| duration | str | Duration of video |
| viewCount | int64 | Total number of video views |
| likeCount | int64 | Total number of video likes |
| dislikeCount | int64 | Total number of video dislikes |
| commentCount | int64 | Total number of video comments |
| Feature | Data Type | Description |
|---|---|---|
| full_title_author | str | Published title of podcast |
| title | str | Official title of podcast |
| author | str | Name of speaker |
| description | str | Podcast description |
| date_released | object | Date released on Spotify |
| duration_ms | int64 | Duration of podcast in milliseconds |
| duration_min | float | Duration of podcast in minutes |
Image(filename="Method.png")
# --- Get video list from the YouTube Data API -------------------------------
# Pages through the TED channel's "activities" feed, accumulating one
# parallel list per field; the lists are zipped into a DataFrame at the end.
# NOTE(review): `requests`, `youtube_api_key` and `proxies` are created
# elsewhere in the notebook — confirm they exist before running this cell.
token = None
publishedAt = []
titles = []
authors = []
description = []
video_id = []
titles_authors = []
while token != 'stop':
    res = requests.get(
        'https://www.googleapis.com/youtube/v3/activities',
        params={
            'key': youtube_api_key,
            'part': 'snippet, contentDetails',
            'channelId': 'UCAuUUnT6oDeKwE6v1NGQxug',
            # NOTE(review): the API caps page size at 50; 500 is presumably
            # silently clamped — confirm against the activities.list docs.
            'maxResults': 500,
            'fields': 'items(snippet/title),'
                      'items(snippet/description),'
                      'items(snippet/publishedAt),'
                      'nextPageToken,'
                      'items(contentDetails),'
                      'pageInfo,'
                      'items(snippet/type)',
            'publishedAfter': "2020-01-01T00:00:00.0Z",
            'pageToken': token
        },
        proxies=proxies
    ).json()
    for i in range(len(res['items'])):
        # check type: only 'upload' activities correspond to actual videos
        if res['items'][i]['snippet']['type'] == 'upload':
            # get published date
            publishedAt.append(res['items'][i]['snippet']['publishedAt'])
            # get titles & authors (published name is "Title | Speaker")
            if 'title' in res['items'][i]['snippet'].keys():
                try:
                    entry = res['items'][i]['snippet']['title']
                    title_author = entry.split(' | ', 1)
                    titles_authors.append(entry)
                    titles.append(title_author[0])
                    # IndexError here (no ' | ' in the name) falls through to
                    # the except, keeping the three lists the same length.
                    authors.append(title_author[1])
                except IndexError:
                    authors.append('')
            else:
                titles.append('')
                authors.append('')
                titles_authors.append('')
            # get description
            if 'description' in res['items'][i]['snippet'].keys():
                description.append(res['items'][i]['snippet']['description'])
            else:
                description.append('')
            # get video ID
            if 'videoId' in res['items'][i]['contentDetails']['upload'].keys():
                video_id.append(res['items'][i]['contentDetails']['upload']['videoId'])
            else:
                video_id.append('')
        else:
            pass
    # check for next page: the API omits nextPageToken on the last page
    try:
        token = res['nextPageToken']
    except (KeyError):
        token = 'stop'
# Store in DataFrame
df_youtube1 = pd.DataFrame({'full_title_author': titles_authors,
                            'title': titles,
                            'author': authors,
                            'published_at': publishedAt,
                            'description': description,
                            'video_id': video_id
                            })
# --- Get per-video statistics from the YouTube Data API ----------------------
youtube_details = []
stats = ['viewCount',
         'likeCount',
         'dislikeCount',
         'favoriteCount',
         'commentCount']
# NOTE(review): the loop variable below overwrites the module-level
# `video_id` list built earlier — confirm nothing downstream needs that list.
for video_id in df_youtube1['video_id'].tolist():
    res_details = requests.get(
        'https://www.googleapis.com/youtube/v3/videos',
        params={
            'key': youtube_api_key,
            'part': 'statistics, contentDetails',
            'id': video_id
        },
        proxies=proxies
    ).json()
    # get duration (ISO-8601 'PT#H#M#S' string)
    duration = res_details['items'][0]['contentDetails']['duration']
    details = {'video_id': video_id, 'duration': duration}
    # get statistics; a stat key can be absent (e.g. comments disabled)
    for stat in stats:
        try:
            details[stat] = res_details['items'][0]['statistics'][stat]
        except KeyError:
            details[stat] = None
    youtube_details.append(details)
# --- Get episode data from the Spotify Web API (first page) ------------------
# NOTE(review): `access_token` is obtained elsewhere in the notebook.
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token),
    'Accept': 'application/json',
    "Content-Type": "application/json"}
# TED Talks Channel in Spotify
get_path = ("https://api.spotify.com/v1/shows/"
            "1VXcH8QHkjRcTCEd88U3ti/episodes?market=US&limit=50")
r = requests.get(get_path,
                 headers=headers,
                 proxies=proxies).json()
spotify_ted = []
# Each episode name is "Title | Speaker"; the two regexes split it apart.
# NOTE(review): the [0] lookups raise IndexError for a name without ' | ' —
# presumably every TED episode follows the pattern; confirm.
for i in range(len(r['items'])):
    spotify_ted.append([r['items'][i]['name'],
                        re.findall(r'^(.+) \|', r['items'][i]['name'])[0],
                        re.findall(r'\| (.+)$', r['items'][i]['name'])[0],
                        r['items'][i]['description'],
                        r['items'][i]['release_date'],
                        r['items'][i]['duration_ms'],
                        r['items'][i]['duration_ms']/60_000])
# --- Get episode data from the Spotify Web API (succeeding pages) -----------
# The API returns a 'next' URL per page (null on the last page).  Both
# lookups use .get: the original read r['next'] unguarded here, which
# raises KeyError when the first response carries no 'next' field.
get_path = r.get('next')
while get_path is not None:
    r = requests.get(get_path,
                     headers=headers,
                     proxies=proxies).json()
    for item in r['items']:
        time.sleep(0.1)  # gentle rate limiting between parsed items
        spotify_ted.append([item['name'],
                            re.findall(r'^(.+) \|', item['name'])[0],
                            re.findall(r'\| (.+)$', item['name'])[0],
                            item['description'],
                            item['release_date'],
                            item['duration_ms'],
                            item['duration_ms'] / 60_000])
    # None (or a missing key) ends the pagination loop.
    get_path = r.get('next')
# Store in DataFrame
df_spotify_ted = pd.DataFrame(spotify_ted,
                              columns=['full_title_author',
                                       'title',
                                       'author',
                                       'description',
                                       'date_released',
                                       'duration_ms',
                                       'duration_min'])
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 4.1 Tables Present in SQLite Database (ted_talks.db)</b></center>'))
Image(filename="ted_talks_db.png", width=500)
# Connecting to an SQLite database file in the working directory.
engine = create_engine('sqlite:///ted_talks.db')
# Persist the Spotify table with explicit SQL column types.  The type
# names (VARCHAR/TEXT/INTEGER/FLOAT) come from sqlalchemy.types and are
# imported at the top of the file — they were previously undefined names.
df_spotify_ted.to_sql(name='spotify', con=engine, if_exists='replace',
                      index=False,
                      dtype={'full_title_author': VARCHAR(length=255),
                             'title': VARCHAR(length=255),
                             'author': VARCHAR(length=255),
                             'description': TEXT,
                             'date_released': TEXT,
                             'duration_ms': INTEGER,
                             'duration_min': FLOAT(precision=6,
                                                   asdecimal=True)})
# Persist the YouTube table with explicit SQL column types.
# NOTE(review): `df_youtube` is not defined at this point in the file
# (only `df_youtube1` is); the notebook presumably built it in a cell not
# shown here — confirm before running the file top to bottom.
df_youtube.to_sql(name='youtube', con=engine, if_exists='replace',
                  index=False,
                  dtype={'full_title_author': VARCHAR(length=255),
                         'title': VARCHAR(length=255),
                         'author': VARCHAR(length=255),
                         'description': VARCHAR(length=255),
                         'published_at': TEXT,
                         'video_id': VARCHAR(length=255),
                         'duration': VARCHAR(length=255),
                         'viewCount': INTEGER,
                         'likeCount': INTEGER,
                         'dislikeCount': INTEGER,
                         'favoriteCount': INTEGER,
                         'commentCount': INTEGER})
# Reload both cleaned tables and keep only talks uploaded inside the study
# window: July 1, 2020 through June 30, 2021 (inclusive on both ends).
conn = sqlite3.connect('ted_talks.db')
df_spotify = pd.read_sql('select * from spotify', conn)
df_spotify = df_spotify.rename(columns={'date_released': 'upload_date'})
df_spotify['upload_date'] = pd.to_datetime(df_spotify['upload_date'])
df_spotify = df_spotify[df_spotify['upload_date'].between('2020-07-01',
                                                          '2021-06-30')]
df_youtube = pd.read_sql('select * from youtube', conn)
df_youtube = df_youtube.rename(columns={'published_at': 'upload_date'})
# Truncate the full timestamp to a calendar date before converting back,
# matching the day-level granularity of the Spotify release dates.
df_youtube['upload_date'] = pd.to_datetime(
    pd.to_datetime(df_youtube['upload_date']).dt.strftime('%Y-%m-%d'))
df_youtube = df_youtube[df_youtube['upload_date'].between('2020-07-01',
                                                          '2021-06-30')]
#convert the duration column in df_youtube: ISO-8601 'PT#H#M#S' -> minutes
def _pt_part(pt_string, pattern):
    """Extract one duration component as an int; 0 when the unit is absent."""
    try:
        return int(''.join(re.findall(pattern, pt_string)))
    except ValueError:
        return 0

duration_min = []
for pt_string in df_youtube['duration']:
    seconds = _pt_part(pt_string, r'(\d{1,2}(?=S))') / 60
    minutes = _pt_part(pt_string, r'(\d{1,2}(?=M))')
    hours = _pt_part(pt_string, r'(\d{1,2}(?=H))') * 60
    duration_min.append(seconds + minutes + hours)
df_youtube['duration_min'] = duration_min
#lower case and strip titles for both dataframes, pre-matching
for frame in (df_spotify, df_youtube):
    frame['title'] = frame['title'].str.lower().str.strip()
#create list of titles
sp_titles = list(df_spotify['title'])
yt_titles = list(df_youtube['title'])
# Titles appearing on both platforms (Spotify duplicates preserved, exactly
# as the original append loop produced them).
common_list = [t for t in sp_titles if t in yt_titles]
#matches in df_spotify & df_youtube for overlap
match_spotify = df_spotify[df_spotify['title'].isin(common_list)]
match_youtube = df_youtube[df_youtube['title'].isin(common_list)]
#create df for overlap
df_overlap = pd.merge(left=match_youtube, right=match_spotify,
                      on='title', suffixes=['_YT', '_SP'])
#spotify only dataframe
df_spotify_only = df_spotify[~df_spotify['title'].isin(common_list)]
#youtube only dataframe
df_youtube_only = df_youtube[~df_youtube['title'].isin(common_list)]
# Show the head of each of the three partitions (overlap / YT-only / SP-only).
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Table 4.1 Dataframe of Titles Found in Both YouTube and Spotify</b></center>'))
display(df_overlap.head())
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Table 4.2 Dataframe of Titles Found Only in YouTube</b></center>'))
display(df_youtube_only.head())
# Fixed caption: the original fragments concatenated to "Found Onlyin
# Spotify" (missing space between the adjacent string literals).
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Table 4.3 Dataframe of Titles Found Only '
             'in Spotify</b></center>'))
display(df_spotify_only.head())
# Create main df for all: inner-join the two platforms on normalized title.
df_all = pd.merge(left=df_youtube, right=df_spotify,
                  left_on='title', right_on='title',
                  suffixes=['_YT', '_SP'])
# Create groupby series for YouTube and Spotify count
# (Grouper freq="M" buckets by calendar month end.)
yt_count = (df_all.groupby(pd.Grouper(key="upload_date_YT", freq="M"))
            ['title'].count().rename('YouTube count'))
sp_count = (df_all.groupby(pd.Grouper(key="upload_date_SP", freq="M"))
            ['title'].count().rename('Spotify count'))
yt_sp_count = pd.concat([yt_count, sp_count], axis=1)
# Plot resulting df as a grouped monthly bar chart.
fig, ax = plt.subplots()
yt_sp_count.plot(y=['YouTube count', 'Spotify count'], kind="bar",
                 color=['#d62728', '#2ca02c'], ax=ax)
ax.set_xticklabels([i.strftime("%Y-%m") for i in yt_sp_count.index],
                   rotation=45)
ax.set_xlabel("Year-Month Uploaded", size=11)
ax.set_ylabel('Upload Count', size=11)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.1 Monthly Upload Count for'
             ' YouTube and Spotify</b></center>'))
plt.show()
# Create main df for delay in upload: for each talk on both platforms,
# compute (YouTube date - Spotify date) and label which platform was first.
df_overlap_delay = df_overlap.copy()
df_overlap_delay = df_overlap_delay[['title', 'author_SP', 'upload_date_SP',
                                     'author_YT', 'upload_date_YT']]
df_overlap_delay['delay(YT-SP)'] = (df_overlap_delay['upload_date_YT']
                                    - df_overlap_delay['upload_date_SP'])
# Compare the timedeltas directly against Timedelta(0) instead of the
# original .astype(int) round-trip — casting timedelta64 to int is
# deprecated/removed in modern pandas, and the detour added nothing.
zero = pd.Timedelta(0)
conditions = [(df_overlap_delay['delay(YT-SP)'] < zero),
              (df_overlap_delay['delay(YT-SP)'] > zero),
              (df_overlap_delay['delay(YT-SP)'] == zero)]
values = ['YouTube', 'Spotify', 'Same Day']
df_overlap_delay['first_uploaded'] = np.select(conditions, values)
# Count how many talks fall in each first-uploaded category.
df_delay_count = (df_overlap_delay.groupby('first_uploaded')
                  ['delay(YT-SP)'].count().reset_index())
# rename replaces the original add-column-then-drop dance.
df_delay_count = (df_delay_count
                  .rename(columns={'delay(YT-SP)': 'count'})
                  .sort_values('count'))
# Set palette: red / orange / green drawn from tab10.
palette_tab10 = np.array(sns.color_palette("tab10"))
palette_delay = np.array([palette_tab10[3],
                          palette_tab10[1],
                          palette_tab10[2]])
# Plot the number of talks per "first uploaded" category.
sns.barplot(data=df_delay_count,
            x='first_uploaded',
            y='count',
            palette=palette_delay)
plt.ylabel('Count')
plt.xlabel('')
plt.gcf().set_size_inches(6, 4)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.2 Count of Where it'
             ' was First Uploaded</b></center>'))
plt.show()
# create dataframe for mean delay duration and where it was delayed.
# Convert each timedelta to fractional days via total_seconds()/86_400
# instead of the original raw int64-nanosecond cast divided by the magic
# constant 86_400000000000 — same value, clearer, and independent of the
# nanosecond storage unit.
df_overlap_delay['delay_float'] = (df_overlap_delay['delay(YT-SP)']
                                   .dt.total_seconds() / 86_400)
df_delay_means = df_overlap_delay.groupby('first_uploaded')['delay_float']
df_delay_means = df_delay_means.mean().abs().reset_index()
df_delay_means['mean_delay'] = df_delay_means['delay_float']
# Drop the first group row.  NOTE(review): groupby sorts its keys, so row 0
# is assumed to be 'Same Day' (mean delay 0) — confirm that group exists.
df_delay_means = df_delay_means.drop(columns='delay_float').iloc[1:]
df_delay_means = df_delay_means.sort_values('mean_delay')
conditions = [
    (df_delay_means['first_uploaded'] == 'YouTube'),
    (df_delay_means['first_uploaded'] == 'Spotify')]
values = ['YouTube to Spotify', 'Spotify to YouTube']
df_delay_means['delayed_in'] = np.select(conditions, values)
# Set palette.
palette_delay_means = np.array([palette_tab10[3], palette_tab10[2]])
# Plot delay duration.
sns.barplot(data=df_delay_means, x='delayed_in',
            y='mean_delay', palette=palette_delay_means)
plt.ylabel('Mean Delay (days)')
plt.xlabel('')
plt.gcf().set_size_inches(6, 4)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.3 Delay of Upload in Other Site</b></center>'))
plt.show()
# Per-talk duration difference (Spotify minutes minus YouTube minutes).
df_durations_OL = df_overlap.copy()
diff_dur = [(df_durations_OL['duration_min_SP'][i]
             - df_durations_OL['duration_min_YT'][i])
            for i in range(len(df_durations_OL))]
# Keep only differences above -10 minutes.  The original called
# diff_dur.remove(i) while iterating over diff_dur, which skips the element
# following every removal and leaves some outliers in the plot; a filtering
# comprehension applies the cutoff to every element.
diff_dur = [d for d in diff_dur if d > -10]
sns.barplot(x=np.arange(len(diff_dur)), y=diff_dur, palette='viridis')
plt.ylabel('Duration Difference (mins)')
plt.xticks([])
plt.gcf().set_size_inches(12, 6)
# Caption typo "Corrresponding" fixed.
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.4 Duration Difference of Spotify Episode to '
             'Corresponding YouTube Video</b></center>'))
plt.show()
#create dataframe of overlap talks with engagement stats
df_durations_OL = df_overlap.copy()
df_durations_OL = df_overlap[['title', 'viewCount', 'likeCount',
                              'dislikeCount', 'commentCount',
                              'duration_min_YT', 'duration_min_SP',
                              'upload_date_YT', 'upload_date_SP']]
df_durations_OL = df_durations_OL.dropna()
df_durations_OL.reset_index(inplace=True)
#create columns for number of days uploaded to date
# (The redundant mid-script `import datetime` was dropped; the module is
# already imported at the top of the file.)
date = "2021-08-08"
final = datetime.datetime.strptime(date, '%Y-%m-%d')
# Vectorized: days each video has been on YouTube as of `final`
# (replaces the original row-by-row Python loops).
df_durations_OL.insert(5, "days_onYT",
                       (final - df_durations_OL['upload_date_YT']).dt.days)
#create columns for engagement averages: count per day since upload.
# Insert order matches the original so column positions are unchanged.
df_durations_OL.insert(5, "daily_comments",
                       df_durations_OL['commentCount'].astype(int)
                       / df_durations_OL['days_onYT'])
df_durations_OL.insert(5, "daily_likes",
                       df_durations_OL['likeCount'].astype(int)
                       / df_durations_OL['days_onYT'])
df_durations_OL.insert(5, "daily_views",
                       df_durations_OL['viewCount'].astype(int)
                       / df_durations_OL['days_onYT'])
#plot: bin the overlap talks into 10-minute duration buckets and count them.
df_dur = df_durations_OL.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 70, 10)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min_YT"],
                                page_range))['daily_views'].count())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_views',
            y='duration_min_YT',
            data=df_dur, palette='viridis')
plt.gcf().set_size_inches(8, 4)
plt.ylabel('Video Duration (mins)')
plt.xlabel('Number of Videos')
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.5 Number of Videos (Both in YouTube & Spotify) '
             'per Duration Time Bin</b></center>'))
plt.show()
# Restrict to talks of at most 20 minutes for the engagement comparison.
df_max20_overlap = df_durations_OL.copy()
df_max20_overlap = df_max20_overlap[(df_durations_OL
                                     ['duration_min_YT'] <= 20)]
#plot average engagement vs duration in 2-minute bins, one panel per metric
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True, figsize=(15, 5))
# Panel 1: mean daily likes per duration bin.
df_dur = df_max20_overlap.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 22, 2)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min_YT"],
                                page_range))['daily_likes'].mean())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_likes',
            y='duration_min_YT',
            data=df_dur, palette='viridis_r',
            ax=ax1)
# Panel 2: mean daily views per duration bin.
df_dur = df_max20_overlap.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 22, 2)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min_YT"],
                                page_range))['daily_views'].mean())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_views',
            y='duration_min_YT',
            data=df_dur, palette='viridis_r',
            ax=ax2)
# Panel 3: mean daily comments per duration bin.
df_dur = df_max20_overlap.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 22, 2)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min_YT"],
                                page_range))['daily_comments'].mean())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_comments',
            y='duration_min_YT',
            data=df_dur, palette='viridis_r',
            ax=ax3)
ax1.set(xlabel='Average Daily Likes', ylabel='Video Duration (mins)')
ax2.set(xlabel='Average Daily Views', ylabel='')
ax3.set(xlabel='Average Daily Comments', ylabel='')
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.6 Average Engagement Values: Likes, Views & '
             'Comments (Both in YouTube & Spotify)</b></center>'))
plt.show()
Meanwhile, for those uploaded only as YouTube videos, the majority of the videos also fall in the 1–20-minute duration range. But in contrast to those uploaded on both platforms, YouTube-only content actually gets higher engagement for videos in the 2-to-6-minute duration range. This suggests that viewers prefer shorter content.
#create dataframe of YouTube-only talks with engagement stats
df_durations_YT = df_youtube_only
df_durations_YT = df_youtube_only.drop(['full_title_author',
                                        'description', 'duration',
                                        'favoriteCount', 'author',
                                        'video_id'], axis=1)
df_durations_YT = df_durations_YT.dropna()
df_durations_YT.reset_index(inplace=True)
#create columns for number of days uploaded to date
# (The redundant mid-script `import datetime` was dropped; the module is
# already imported at the top of the file.)
date = "2021-08-08"
final = datetime.datetime.strptime(date, '%Y-%m-%d')
# Vectorized replacements for the original row-by-row Python loops,
# mirroring the overlap-dataframe computation above for consistency.
df_durations_YT.insert(5, "days_onYT",
                       (final - df_durations_YT['upload_date']).dt.days)
#create columns for engagement averages: count per day since upload.
df_durations_YT.insert(5, "daily_comments",
                       df_durations_YT['commentCount'].astype(int)
                       / df_durations_YT['days_onYT'])
df_durations_YT.insert(5, "daily_likes",
                       df_durations_YT['likeCount'].astype(int)
                       / df_durations_YT['days_onYT'])
df_durations_YT.insert(5, "daily_views",
                       df_durations_YT['viewCount'].astype(int)
                       / df_durations_YT['days_onYT'])
# plot: bin YouTube-only videos into 10-minute duration buckets and count.
df_dur = df_durations_YT.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 70, 10)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min"],
                                page_range))['daily_views'].count())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_views',
            y='duration_min',
            data=df_dur, palette='rocket')
plt.gcf().set_size_inches(8, 4)
plt.ylabel('Video Duration (mins)')
plt.xlabel('Number of Videos')
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.7 Number of Videos (Only in YouTube) per Duration '
             'Time Bin</b></center>'))
plt.show()
# Restrict to videos of at most 20 minutes for the engagement comparison.
df_max20_youtube = df_durations_YT.copy()
df_max20_youtube = df_max20_youtube[(df_durations_YT['duration_min'] <= 20)]
# Three panels: mean daily likes / views / comments per 2-minute bin.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True, figsize=(15, 5))
df_dur = df_max20_youtube.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 22, 2)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min"],
                                page_range))['daily_likes'].mean())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_likes',
            y='duration_min',
            data=df_dur, palette='rocket',
            ax=ax1)
df_dur = df_max20_youtube.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 22, 2)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min"],
                                page_range))['daily_views'].mean())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_views',
            y='duration_min',
            data=df_dur, palette='rocket',
            ax=ax2)
df_dur = df_max20_youtube.copy()
df_dur['dur_range'] = 0
page_range = np.arange(0, 22, 2)
df_dur = (df_dur.groupby(pd.cut(df_dur["duration_min"],
                                page_range))['daily_comments'].mean())
df_dur = df_dur.reset_index()
sns.barplot(x='daily_comments',
            y='duration_min',
            data=df_dur, palette='rocket',
            ax=ax3)
ax1.set(xlabel='Average Daily Likes', ylabel='Video Duration (mins)')
ax2.set(xlabel='Average Daily Views', ylabel='')
ax3.set(xlabel='Average Daily Comments', ylabel='')
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.8 Average Engagement Values: Likes, Views & '
             'Comments (Only in YouTube)</b></center>'))
plt.show()
# Left-join Spotify onto YouTube so every YouTube talk is kept; a non-null
# joined column marks the talks that also exist on Spotify.
df_merge_stats = pd.merge(left=df_youtube, right=df_spotify,
                          on='title', how='left')
df_merge_stats['Uploaded in Spotify?'] = (~df_merge_stats
                                          ['full_title_author_y'].isna())
# get needed columns
df_merge_stats = df_merge_stats.loc[:, ['video_id',
                                        'viewCount',
                                        'likeCount',
                                        'dislikeCount',
                                        'commentCount',
                                        'Uploaded in Spotify?']]
# convert stats to numeric
# cols = df_merge_stats.columns.drop('video_id')
# df_merge_stats[cols] = (df_merge_stats[cols].apply
#                         (pd.to_numeric, errors='coerce'))
# plot: median of each engagement stat, split by Spotify availability.
# NOTE(review): ci=None is deprecated in seaborn >= 0.12 (errorbar=None).
fig, ax = plt.subplots(2, 2, figsize=(10, 8))
ax = ax.flatten()
stats = ["viewCount",
         "likeCount",
         "dislikeCount",
         "commentCount"]
stat_label = ["Youtube View Count",
              "Youtube Like Count",
              "Youtube Dislike Count",
              "Youtube Comment Count"]
for i in range(len(ax)):
    sns.barplot(x="Uploaded in Spotify?",
                y=stats[i],
                data=df_merge_stats,
                palette=['#d62728', '#bcbd22'],
                estimator=np.median, ci=None, ax=ax[i])
    ax[i].set_xlabel("Channel Uploaded", size=11)
    ax[i].set_ylabel(stat_label[i], size=11)
    ax[i].set_xticklabels(['Youtube only', 'Youtube and Spotify'])
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.9 Comparison of YouTube'
             ' engagement statistics between TED Talk content in YouTube only'
             ' and in both YouTube and Spotify</b></center>'))
# Compute the correlation matrix
# NOTE(review): relies on pandas silently dropping the non-numeric
# 'video_id' column; pandas >= 2.0 raises here unless numeric_only=True —
# confirm the pandas version in use.
corr = df_merge_stats.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(9, 9))
# Generate a custom diverging colormap
# cmap = sns.diverging_palette(230, 20, as_cmap=True)
corr_labels = ['View Count',
               'Like Count',
               'Dislike Count',
               'Comment Count',
               'Spotify Upload']
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap="rocket_r",
            square=True, linewidths=.5, annot=True,
            xticklabels=corr_labels, yticklabels=corr_labels)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.10 Correlation matrix of YouTube engagement'
             ' statistics including factor of whether content is'
             ' in YouTube only or both platforms</b></center>'))
plt.show()
# Talks-per-speaker counts for each platform/partition, descending.
speakers_cnt_sp = df_spotify['author'].value_counts().sort_values(ascending=False)
speakers_cnt_yt = df_youtube['author'].value_counts().sort_values(ascending=False)
speakers_cnt_sp_only = df_spotify_only['author'].value_counts().sort_values(ascending=False)
speakers_cnt_yt_only = df_youtube_only['author'].value_counts()
# Speakers credited with more than one talk; blank author names are
# excluded from the full-platform series.
has_name_sp = speakers_cnt_sp.index != ''
has_name_yt = speakers_cnt_yt.index != ''
mult_speakers_sp = speakers_cnt_sp[(speakers_cnt_sp > 1) & has_name_sp]
mult_speakers_yt = speakers_cnt_yt[(speakers_cnt_yt > 1) & has_name_yt]
mult_speakers_sp_only = speakers_cnt_sp_only[speakers_cnt_sp_only > 1]
mult_speakers_yt_only = speakers_cnt_yt_only[speakers_cnt_yt_only > 1]
# Youtube stats: tag every video by whether its speaker has one or many
# talks, then average the engagement measures per group.
df_yt_speaker_tag = df_youtube.copy()
speakers_cnt_yt = df_youtube['author'].value_counts()
mult_speakers_yt = speakers_cnt_yt[(speakers_cnt_yt > 1) &
                                   (speakers_cnt_yt.index != '')]
mult_tag_yt = df_youtube['author'].isin(list(mult_speakers_yt.index))
# One np.where replaces the original pair of boolean .loc assignments.
df_yt_speaker_tag['speaker_tag'] = np.where(mult_tag_yt,
                                            'Multiple', 'Single')
yt_measures = ['viewCount', 'likeCount', 'dislikeCount', 'favoriteCount',
               'commentCount', 'duration_min']
df_yt_speaker_summ = (df_yt_speaker_tag.groupby('speaker_tag')
                      [yt_measures].mean())
df_yt_speaker_summ['channel'] = 'YouTube'
# Spotify stats: same single/multiple speaker tagging as for YouTube.
df_sp_speaker_tag = df_spotify.copy()
speakers_cnt_sp = df_spotify['author'].value_counts().sort_values(ascending=False)
mult_speakers_sp = speakers_cnt_sp[(speakers_cnt_sp > 1) &
                                   (speakers_cnt_sp.index != '')]
mult_tag_sp = df_spotify['author'].isin(list(mult_speakers_sp.index))
df_sp_speaker_tag['speaker_tag'] = np.where(mult_tag_sp,
                                            'Multiple', 'Single')
sp_measures = ['duration_ms', 'duration_min']
df_sp_speaker_summ = (df_sp_speaker_tag.groupby('speaker_tag')
                      [sp_measures].mean())
df_sp_speaker_summ['channel'] = 'Spotify'
# Combine Summary Statistics from both platforms into one frame.
df_yt_speaker_tag['channel'] = 'YouTube'
df_sp_speaker_tag['channel'] = 'Spotify'
df_comb_speaker = pd.concat([df_yt_speaker_tag, df_sp_speaker_tag])
# Set palette.
palette_tab10 = np.array(sns.color_palette("tab10"))
palette_tab10  # bare expression left over from notebook display
palette_comb = np.array([palette_tab10[3], palette_tab10[2]])
palette_red = np.array([palette_tab10[3]])
palette_green = np.array([palette_tab10[2]])
# Two panels: number of videos/episodes per speaker-tag group per channel.
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharey=True)
channel = ['YouTube', 'Spotify']
for i in range(len(ax)):
    show_channel_tag = df_comb_speaker['channel'] == channel[i]
    # estimator=len turns the barplot into a count plot.
    sns.barplot(x='speaker_tag', y='duration_min',
                data=df_comb_speaker[show_channel_tag],
                palette=[palette_comb[i]], ci=95,
                estimator=len,
                ax=ax[i])
    ax[i].set_xlabel(channel[i])
    ax[i].set_ylabel('')
fig.text(0.05, 0.5, 'Count', va='center', rotation='vertical', size=12)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.11 Number of Videos or Episodes for Speakers with '
             'Single or Multiple Talks in YouTube and Spotify </b></center>'))
plt.show()
# Bullet lists of every speaker with more than one talk, per platform.
display(HTML('<h3>YouTube Speakers with Multiple Shows</h3>'))
for speaker in sorted(mult_speakers_yt.index):
    display(HTML(f'<li>{speaker}</li>'))
# Spotify names carry stray whitespace; strip before sorting.
sp_multiple = sorted(speaker.strip() for speaker in mult_speakers_sp.index)
display(HTML('<h3>Spotify Speakers with Multiple Shows</h3>'))
for speaker in sp_multiple:
    display(HTML(f'<li>{speaker}</li>'))
# Two panels: mean video/episode length per speaker-tag group per channel.
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharey=True)
channel = ['YouTube', 'Spotify']
for i in range(len(ax)):
    show_channel_tag = df_comb_speaker['channel'] == channel[i]
    # display(df_comb_speaker[df_comb_speaker['channel'] == channel[i]].head())
    sns.barplot(x='speaker_tag', y='duration_min',
                data=df_comb_speaker[show_channel_tag],
                palette=[palette_comb[i]], ci=None,
                estimator=np.mean,
                ax=ax[i])
    ax[i].set_xlabel(channel[i])
    ax[i].set_ylabel('')
fig.text(0.05, 0.5, 'Minutes', va='center', rotation='vertical', size=12)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.12 Length of Videos or Episodes for Speakers with '
             'Single or Multiple Talks in YouTube and Spotify</b></center>'))
plt.show()
# Four panels: mean YouTube engagement per speaker-tag group, one metric each.
fig, ax = plt.subplots(1, 4, figsize=(20, 6))
yt_measures_bar = ['viewCount', 'likeCount', 'dislikeCount', 'commentCount']
yt_measures_label = ['Views', 'Likes', 'Dislikes', 'Comments']
for i in range(len(ax)):
    sns.barplot(x='speaker_tag', y=yt_measures_bar[i], data=df_yt_speaker_tag,
                palette=palette_red, estimator=np.mean, ax=ax[i], ci=None)
    ax[i].set_xlabel(yt_measures_label[i])
    ax[i].set_ylabel('')
fig.text(0.5, 0.01, 'Speakers with Quantity of Talks in TED YouTube',
         ha='center', size=14)
fig.text(0.05, 0.5, 'Count', va='center', rotation='vertical', size=14)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.13 Engagement Count (Views, Likes, Dislikes and '
             'Comments for Speakers with Single or Multiple Talks in '
             'YouTube</b></center>'))
plt.show()
# Word cloud of title words for YouTube speakers with a single talk.
single_yt_titles = df_yt_speaker_tag[df_yt_speaker_tag.speaker_tag == 'Single'].title
top_words = ''
for entry in single_yt_titles:
    # Lower-case the whole title; splitting/joining normalizes whitespace.
    top_words += ' '.join(str(entry).lower().split()) + ' '
wordcloud = WordCloud(width=800, height=200,
                      background_color='white',
                      colormap='OrRd',
                      max_words=30,
                      collocations=True,
                      min_font_size=10,
                      max_font_size=100).generate(top_words)
# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.14 Wordcloud for Titles from YouTube Speakers with '
             'a Single Episode</b></center>'))
# Word cloud of title words for YouTube speakers with multiple talks.
multi_yt_titles = df_yt_speaker_tag[df_yt_speaker_tag.speaker_tag == 'Multiple'].title
top_words = ''
for entry in multi_yt_titles:
    top_words += ' '.join(str(entry).lower().split()) + ' '
wordcloud = WordCloud(width=800, height=200,
                      background_color='white',
                      colormap='OrRd',
                      max_words=30,
                      collocations=True,
                      min_font_size=10,
                      max_font_size=100).generate(top_words)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.15 Wordcloud for Titles from YouTube Speakers with '
             'Multiple Episodes</b></center>'))
# Word cloud of title words for Spotify speakers with a single talk.
single_sp_titles = df_sp_speaker_tag[df_sp_speaker_tag.speaker_tag == 'Single'].title
top_words = ''
for entry in single_sp_titles:
    top_words += ' '.join(str(entry).lower().split()) + ' '
wordcloud = WordCloud(width=800, height=200,
                      background_color='white',
                      colormap='Greens',
                      max_words=30,
                      collocations=True,
                      min_font_size=10,
                      max_font_size=100).generate(top_words)
# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.16 Wordcloud for Titles from Spotify Speakers with '
             'a Single Episode</b></center>'))
# Word cloud of title words for Spotify speakers with multiple talks.
multi_sp_titles = df_sp_speaker_tag[df_sp_speaker_tag.speaker_tag == 'Multiple'].title
top_words = ''
for entry in multi_sp_titles:
    top_words += ' '.join(str(entry).lower().split()) + ' '
wordcloud = WordCloud(width=800, height=200,
                      background_color='white',
                      colormap='Greens',
                      max_words=30,
                      collocations=True,
                      min_font_size=10,
                      max_font_size=100).generate(top_words)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.17 Wordcloud for Titles from Spotify Speakers with '
             'Multiple Episodes</b></center>'))
conn = sqlite3.connect('ted_talks.db')
# Initial data frame: one row per video with its engagement stats.
df_YT_TW_initial = pd.read_sql('SELECT * from youtube', conn)
df_YT_TW_initial = df_YT_TW_initial[['video_id', 'title', 'published_at',
                                     'viewCount', 'likeCount',
                                     'dislikeCount']]
# Distribute word per row: explode each lower-cased title into its words.
df_YT_TW = df_YT_TW_initial.assign(
    title=df_YT_TW_initial['title'].str.lower().str.split()).explode('title')
# Strip punctuation.  regex=True is stated explicitly: pandas >= 2.0
# defaults str.replace to literal matching, which would silently leave
# the punctuation in place.
df_YT_TW['title'] = df_YT_TW['title'].str.replace(r'[^\w\s]', '', regex=True)
df_YT_TW['published_at'] = (pd.to_datetime(
                            df_YT_TW['published_at'])
                            ).dt.strftime('%b %Y')
df_YT_TW = df_YT_TW.reset_index().iloc[:, 1:]
stop_words = stopwords.words('english') + ['could', 'us', 'isnt']
# Filtering words: drop stop words and empty tokens.  Vectorized isin
# replaces the original index-collecting Python loop.
YT_filter = df_YT_TW[~df_YT_TW['title'].isin(stop_words)]
YT_filter = YT_filter[YT_filter['title'] != '']
# Pivot to word x month counts, plus a total per word.
df_YT_TW_filtered = pd.pivot_table(YT_filter,
                                   index=YT_filter['title'],
                                   columns=YT_filter['published_at'],
                                   aggfunc='size',
                                   fill_value=0)
df_YT_TW_filtered['total_count'] = df_YT_TW_filtered.sum(axis=1)
df_YT_TW_filtered = (df_YT_TW_filtered.rename_axis('idx')
                     .sort_values(by=['total_count', 'idx'],
                                  ascending=[False, True]))
# Top used words bar graph
top10 = df_YT_TW_filtered[:11]
top10_bar = sns.barplot(x='total_count', y=top10.index.values,
data=top10, color='Red', saturation=.5)
display(HTML('<center style="font-size:12px;font-style:default;">'
'<b>Figure 5.18 Youtube Top 10 Words</b></center>'))
Climate, Change, and Future topped the most commonly used words in YouTube titles at 27, 22, and 19 mentions, respectively. Many of the other most common words can be grouped together and related to the ongoing pandemic caused by the spread of COVID-19.
# Get video_id from df_YT_TW_initial for titles containing any top word.
# NOTE(review): matching is substring-based (e.g. 'life' also matches
# 'lifetime') — kept as-is to preserve the published figure; confirm
# whether word boundaries are wanted.
pattern = '|'.join(top10.index.values)
# .copy() fixes the chained-assignment-on-a-slice problem at the root,
# so the original global set_option('mode.chained_assignment', None)
# silencing is no longer needed.
df_YT_ratings = df_YT_TW_initial[
    df_YT_TW_initial['title'].str.lower().str.contains(pattern)].copy()
df_YT_ratings['word'] = (df_YT_ratings['title'].str.lower()
                         .str.findall(pattern)
                         .apply(', '.join))
# One row per (video, matched word) pair.
df_YT_ratings = (df_YT_ratings
                 .assign(word=df_YT_ratings['word'].str.split(', '))
                 .explode('word')).reset_index()
df_YT_ratings = df_YT_ratings[['word', 'video_id', 'title', 'viewCount',
                               'likeCount', 'dislikeCount']]
# Ratings bar graph: relative rating per word = likes / (likes + dislikes).
YT_ratings_pivot = pd.pivot_table(df_YT_ratings, index='word',
                                  values=['viewCount', 'likeCount',
                                          'dislikeCount'],
                                  aggfunc='mean')
YT_ratings_pivot['rate %'] = (YT_ratings_pivot['likeCount'] /
                              (YT_ratings_pivot['likeCount'] +
                               YT_ratings_pivot['dislikeCount'])) * 100
top10_rate = sns.barplot(x='rate %', y=YT_ratings_pivot.index.values,
                         data=YT_ratings_pivot, color='Red', saturation=.5)
top10_rate;
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.19 Relative Ratings per Word</b></center>'))
The audience received videos associated with the words health and life positively, giving them relative ratings of 95.98% and 96.47%, respectively.
# Monthly word trend: for each month column, record the word(s) with the
# highest count that month. columns[:-1] skips only the appended
# 'total_count' column — the original [:-2] also dropped the
# alphabetically-last month column, excluding it from the trend.
monthly_top = []
for month in df_YT_TW_filtered.columns[:-1]:
    # Compute the max once and collect all tied words in one pass,
    # instead of re-appending a duplicate entry per tied row.
    peak = df_YT_TW_filtered[month].max()
    peak_words = df_YT_TW_filtered.index[
        df_YT_TW_filtered[month] == peak].tolist()
    monthly_top.append((month, peak_words, peak))
monthly_YT_trend = pd.DataFrame(monthly_top, columns=['month',
                                                      'top_word', 'count'])
monthly_YT_trend['month'] = pd.to_datetime(monthly_YT_trend['month'])
# One row per (month, word) pair.
monthly_YT_trend = monthly_YT_trend.explode('top_word')
monthly_YT_trend = monthly_YT_trend.sort_values('month', ascending=True)
monthly_YT_trend = monthly_YT_trend.drop_duplicates().reset_index(drop=True)
fig = px.scatter(monthly_YT_trend, x='month', y=None,
                 size='count',
                 hover_name='top_word',
                 color='top_word',
                 labels={'month': 'Month_Year'},
                 color_continuous_scale='reds')
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.show()
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.20 Monthly Word Trend</b></center>'))
# Initial data frame: Spotify episodes with title and release date.
df_S_TW_initial = pd.read_sql('SELECT * from spotify', conn)
df_S_TW_initial = df_S_TW_initial[['title', 'date_released']]
# Distribute one lower-cased title word per row.
df_S_TW = df_S_TW_initial.assign(
    title=df_S_TW_initial['title'].str.lower().str.split()
).explode('title')
# Strip punctuation. regex=True is explicit: newer pandas treats the
# pattern as a literal by default, which would silently stop stripping.
df_S_TW['title'] = df_S_TW['title'].str.replace(r'[^\w\s]', '', regex=True)
df_S_TW['date_released'] = pd.to_datetime(
    df_S_TW['date_released']).dt.strftime('%b %Y')
df_S_TW = df_S_TW.reset_index(drop=True)
stop_words = stopwords.words('english') + ['could', 'us', 'isnt']
# Filter out stop words and empty tokens (vectorized mask replaces the
# original positional-index loop; equivalent after the reset_index above).
S_filter = df_S_TW[~df_S_TW['title'].isin(stop_words)]
S_filter = S_filter[S_filter['title'] != '']
# Word x month frequency table, plus a per-word grand total.
df_S_TW_filtered = pd.pivot_table(S_filter,
                                  index=S_filter['title'],
                                  columns=S_filter['date_released'],
                                  aggfunc='size',
                                  fill_value=0)
df_S_TW_filtered['total_count'] = df_S_TW_filtered.sum(axis=1)
df_S_TW_filtered = (df_S_TW_filtered.rename_axis('idx')
                    .sort_values(by=['total_count', 'idx'],
                                 ascending=[False, True]))
# Top used words bar graph. [:10] matches the "Top 10" caption — the
# original [:11] plotted 11 bars.
top10 = df_S_TW_filtered[:10]
top10_bar = sns.barplot(x='total_count', y=top10.index.values,
                        data=top10, color='Green', saturation=.5)
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.21 Spotify Top 10 Words</b></center>'))
# Monthly word trend (Spotify): for each month column, record the word(s)
# with the highest count that month. columns[:-1] skips only the appended
# 'total_count' column — the original [:-2] also dropped the
# alphabetically-last month column, excluding it from the trend.
monthly_top = []
for month in df_S_TW_filtered.columns[:-1]:
    # Compute the max once and collect all tied words in one pass,
    # instead of re-appending a duplicate entry per tied row.
    peak = df_S_TW_filtered[month].max()
    peak_words = df_S_TW_filtered.index[
        df_S_TW_filtered[month] == peak].tolist()
    monthly_top.append((month, peak_words, peak))
monthly_S_trend = pd.DataFrame(monthly_top, columns=['month',
                                                     'top_word', 'count'])
monthly_S_trend['month'] = pd.to_datetime(monthly_S_trend['month'])
# One row per (month, word) pair.
monthly_S_trend = monthly_S_trend.explode('top_word')
monthly_S_trend = monthly_S_trend.sort_values('month', ascending=True)
monthly_S_trend = monthly_S_trend.drop_duplicates().reset_index(drop=True)
fig = px.scatter(monthly_S_trend, x='month', y=None,
                 size='count',
                 hover_name='top_word',
                 color='top_word',
                 labels={'month': 'Month_Year'},
                 color_continuous_scale='greens')
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.show()
display(HTML('<center style="font-size:12px;font-style:default;">'
             '<b>Figure 5.22 Monthly Word Trend</b></center>'))
June 2020 consists of words used only once because TED had only just begun uploading its podcast to Spotify at that time. Additionally, unlike on YouTube, the commonly used words on Spotify cannot be linked together to form a distinct topic such as the spread of COVID-19.
We would like to acknowledge our references for this study: